Diabetes prediction using machine learning¶
Importing the necessary packages¶
In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
Reading the dataset and getting started with the analysis¶
In [2]:
dataset=pd.read_csv("diabetes.csv")
In [3]:
dataset.head()
Out[3]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
In [4]:
dataset["Age"].max()
Out[4]:
81
In [5]:
dataset.shape
Out[5]:
(768, 9)
In [6]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [7]:
dataset.describe()
Out[7]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
In [8]:
dataset.isnull().sum()
Out[8]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
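isnull reports no missing values, but the describe output above shows zero minima for Glucose, BloodPressure, SkinThickness, Insulin and BMI, which are physiologically impossible; the notebook later treats them as outliers. A non-mutating sketch (not part of the original run) that counts those zeros as if they were missing:¶
In [ ]:
# Count the physiologically impossible zeros without modifying the dataframe
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
dataset[cols].replace(0, np.nan).isna().sum()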
Converting Age into a categorical variable¶
In [9]:
dataset['Age'].median()
Out[9]:
29.0
In [10]:
# Binary age flag: 1 = below the median age of 29, 0 = 29 or older
dataset.loc[dataset['Age']>=29, 'Age_mod']=0
dataset.loc[dataset['Age']< 29, 'Age_mod']=1
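The two .loc assignments above can also be written as a single vectorized step. A minimal sketch with np.where (not part of the original run; note it produces ints rather than the floats shown below):¶
In [ ]:
# Equivalent one-liner: 1 = below the median age of 29, 0 otherwise
dataset['Age_mod'] = np.where(dataset['Age'] < 29, 1, 0)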
In [11]:
dataset.head()
Out[11]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod |
---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 |
In [12]:
dataset['Age_mod'].value_counts()
Out[12]:
Age_mod
0.0    401
1.0    367
Name: count, dtype: int64
Correlation¶
In [13]:
dataset.corr()
Out[13]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod |
---|---|---|---|---|---|---|---|---|---|---|
Pregnancies | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | 0.221898 | -0.577589 |
Glucose | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.466581 | -0.246323 |
BloodPressure | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.065068 | -0.197117 |
SkinThickness | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 0.074752 | 0.089299 |
Insulin | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.130548 | 0.035277 |
BMI | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.292695 | -0.093753 |
DiabetesPedigreeFunction | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.173844 | -0.050555 |
Age | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | 0.238356 | -0.756872 |
Outcome | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 1.000000 | -0.312100 |
Age_mod | -0.577589 | -0.246323 | -0.197117 | 0.089299 | 0.035277 | -0.093753 | -0.050555 | -0.756872 | -0.312100 | 1.000000 |
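A heatmap can make the correlation matrix above easier to scan; a small sketch (not part of the original run):¶
In [ ]:
# Visualize the correlation matrix; annot prints each coefficient in its cell
plt.figure(figsize=(10, 8))
sns.heatmap(dataset.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()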
In [14]:
sns.pairplot(dataset)
Out[14]:
<seaborn.axisgrid.PairGrid at 0x21f2d37dcd0>
EDA¶
In [15]:
dataset.head()
Out[15]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod |
---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 |
Creating a function to get the barplots¶
In [16]:
dataset.columns
Out[16]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod'], dtype='object')
Define a function graph that draws a bar chart of the chosen column against Outcome¶
In [17]:
def graph(feature, color):
    # bar chart of the chosen column against the two Outcome classes
    plt.bar(dataset['Outcome'], dataset[feature], color=color)
    plt.xlabel('Outcome')
    plt.ylabel(feature)
    plt.show()
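Because plt.bar draws one bar per row, the charts below stack 768 bars at just two x positions, so only the tallest value per class stays visible. A per-class distribution plot is often easier to read; a minimal sketch using Glucose as an example (not part of the original run):¶
In [ ]:
# One box per Outcome class shows the spread, not just the maximum
sns.boxplot(x='Outcome', y='Glucose', data=dataset)
plt.show()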
Pregnancies vs Outcome¶
In [18]:
graph('Pregnancies', 'purple')
Glucose vs Outcome¶
In [19]:
graph('Glucose', 'grey')
BloodPressure vs Outcome¶
In [21]:
graph('BloodPressure', 'orange')
SkinThickness vs Outcome¶
In [23]:
graph('SkinThickness', 'brown')
Insulin vs Outcome¶
In [25]:
graph('Insulin', 'green')
BMI vs Outcome¶
In [27]:
graph('BMI', 'magenta');
DiabetesPedigreeFunction vs Outcome¶
In [28]:
graph('DiabetesPedigreeFunction', 'yellow');
Age vs Outcome¶
In [29]:
graph('Age', 'magenta');
Outliers¶
Pregnancies¶
In [30]:
sns.distplot(dataset['Pregnancies'])
(seaborn warns here, and at every distplot call below, that distplot is deprecated and will be removed in v0.14.0; displot or histplot are the suggested replacements.)
Out[30]:
<Axes: xlabel='Pregnancies', ylabel='Density'>
In [31]:
plt.boxplot(dataset['Pregnancies']);
In [32]:
IQR= dataset['Pregnancies'].quantile(0.75)-dataset['Pregnancies'].quantile(0.25)
IQR
Out[32]:
5.0
In [33]:
upper_preg=dataset['Pregnancies'].quantile(0.75)+(IQR*3)
upper_preg
Out[33]:
21.0
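The same fence arithmetic is repeated for every column below, so a small helper keeps it in one place. A hypothetical sketch using the 3×IQR fences computed above:¶
In [ ]:
# Hypothetical helper: returns the (lower, upper) outlier fences for a series
def iqr_fences(series, k=3):
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

iqr_fences(dataset['Pregnancies'])  # (-14.0, 21.0), matching the fence above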
In [34]:
dataset['Pregnancies'].nunique()
Out[34]:
17
In [35]:
dataset['Pregnancies'].unique()
Out[35]:
array([ 6, 1, 8, 0, 5, 3, 10, 2, 4, 7, 9, 11, 13, 15, 17, 12, 14], dtype=int64)
In [36]:
dataset['Pregnancies'].value_counts()
Out[36]:
Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: count, dtype: int64
In [37]:
# rows with 6 or more pregnancies (counts taken from the table above)
50+45+38+28+24+11+10+9+2+1+1
Out[37]:
219
In [38]:
# rows with more than 10 pregnancies
11+10+9+2+1+1
Out[38]:
34
In [39]:
# rows with fewer than 6 pregnancies
768-219
Out[39]:
549
In [40]:
dataset.shape
Out[40]:
(768, 10)
In [41]:
# percentage of rows with 6 or more pregnancies
(219*100)/768
Out[41]:
28.515625
In [42]:
# percentage of rows with more than 10 pregnancies
(34*100)/768
Out[42]:
4.427083333333333
In [43]:
dataset['Pregnancies_mod']=dataset['Pregnancies']
In [44]:
dataset.loc[dataset['Pregnancies']> 10, 'Pregnancies_mod']=5  # values above 10 are replaced with 5 (see the note below)
In [45]:
dataset['Pregnancies'].max()
Out[45]:
17
In [46]:
dataset['Pregnancies_mod'].max()
Out[46]:
10
We replaced every Pregnancies value above 10 with 5, so Pregnancies_mod now tops out at 10¶
In [47]:
dataset.head()
Out[47]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 |
Glucose¶
In [48]:
sns.distplot(dataset['Glucose']);
In [49]:
plt.boxplot(dataset['Glucose']);
In [50]:
IQR= dataset['Glucose'].quantile(0.75)-dataset['Glucose'].quantile(0.25)
IQR
Out[50]:
41.25
In [51]:
lower_glucose=dataset['Glucose'].quantile(0.25)-(IQR*3)
lower_glucose
Out[51]:
-24.75
In [52]:
dataset['Glucose'].min()
Out[52]:
0
In [53]:
dataset['Glucose'].max()
Out[53]:
199
In [54]:
dataset['Glucose_mod']=dataset['Glucose']
In [55]:
dataset.loc[dataset['Glucose']<20, 'Glucose_mod']=20
In [56]:
dataset.head(10)
Out[56]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 | 148 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 | 85 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 | 183 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 | 89 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 | 137 |
5 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 | 0.0 | 5 | 116 |
6 | 3 | 78 | 50 | 32 | 88 | 31.0 | 0.248 | 26 | 1 | 1.0 | 3 | 78 |
7 | 10 | 115 | 0 | 0 | 0 | 35.3 | 0.134 | 29 | 0 | 0.0 | 10 | 115 |
8 | 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | 1 | 0.0 | 2 | 197 |
9 | 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 | 0.0 | 8 | 125 |
In [57]:
dataset['Glucose_mod'].min()
Out[57]:
20
We capped glucose values below 20 at 20¶
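The .loc pattern used for capping can also be expressed with pandas' clip; an equivalent sketch (the notebook keeps the .loc version):¶
In [ ]:
# clip floors every value below 20 at 20; upper caps work the same way via upper=
dataset['Glucose_mod'] = dataset['Glucose'].clip(lower=20)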
Blood Pressure¶
In [58]:
sns.distplot(dataset['BloodPressure'])
Out[58]:
<Axes: xlabel='BloodPressure', ylabel='Density'>
In [59]:
plt.boxplot(dataset['BloodPressure']);
In [60]:
dataset['BloodPressure'].nunique()
Out[60]:
47
In [61]:
dataset['BloodPressure'].min()
Out[61]:
0
In [62]:
dataset['BloodPressure'].max()
Out[62]:
122
In [63]:
IQR= dataset['BloodPressure'].quantile(0.75)-dataset['BloodPressure'].quantile(0.25)
IQR
Out[63]:
18.0
In [64]:
lower_bp=dataset['BloodPressure'].quantile(0.25)-(IQR*3)
lower_bp
Out[64]:
8.0
In [65]:
upper_bp=dataset['BloodPressure'].quantile(0.75)+(IQR*3)
upper_bp
Out[65]:
134.0
In [66]:
dataset['BloodPressure_mod']=dataset['BloodPressure']
In [67]:
dataset.loc[dataset['BloodPressure']<40, 'BloodPressure_mod']=40
In [68]:
dataset.head()
Out[68]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 | 148 | 72 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 | 85 | 66 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 | 183 | 64 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 | 89 | 66 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 | 137 | 40 |
We capped blood pressure values below 40 at 40¶
Skin Thickness¶
In [70]:
sns.distplot(dataset['SkinThickness']);
In [71]:
plt.boxplot(dataset['SkinThickness']);
In [72]:
dataset['SkinThickness'].max()
Out[72]:
99
In [73]:
dataset['SkinThickness'].min()
Out[73]:
0
In [74]:
dataset['SkinThickness_mod']=dataset['SkinThickness']
In [75]:
dataset.loc[dataset['SkinThickness']<2, 'SkinThickness_mod']=2
In [76]:
IQ1R= dataset['SkinThickness'].quantile(0.75)-dataset['SkinThickness'].quantile(0.25)
IQ1R
Out[76]:
32.0
In [77]:
upperst=dataset['SkinThickness'].quantile(0.75)+(IQ1R*3)
upperst
Out[77]:
128.0
In [78]:
dataset.head()
Out[78]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 | 148 | 72 | 35 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 | 85 | 66 | 29 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 | 183 | 64 | 2 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 | 89 | 66 | 23 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 | 137 | 40 | 35 |
Insulin¶
In [79]:
sns.distplot(dataset['Insulin']);
In [80]:
plt.boxplot(dataset['Insulin']);
In [81]:
dataset['Insulin'].min()
Out[81]:
0
In [82]:
dataset['Insulin'].max()
Out[82]:
846
In [83]:
dataset['Insulin_mod']=dataset['Insulin']
In [84]:
dataset.loc[dataset['Insulin']<23, 'Insulin_mod']=23
In [85]:
IQ1R= dataset['Insulin'].quantile(0.75)-dataset['Insulin'].quantile(0.25)
IQ1R
Out[85]:
127.25
In [86]:
upper_IQR=dataset['Insulin'].quantile(0.75)+(IQ1R*3)
upper_IQR
Out[86]:
509.0
In [87]:
dataset.loc[dataset['Insulin']>509, 'Insulin_mod']=509
In [88]:
dataset['Insulin_mod'].min()
Out[88]:
23
In [89]:
dataset['Insulin_mod'].max()
Out[89]:
509
In [91]:
dataset.head(10)
Out[91]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod | Insulin_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 | 148 | 72 | 35 | 23 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 | 85 | 66 | 29 | 23 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 | 183 | 64 | 2 | 23 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 | 89 | 66 | 23 | 94 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 | 137 | 40 | 35 | 168 |
5 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 | 0.0 | 5 | 116 | 74 | 2 | 23 |
6 | 3 | 78 | 50 | 32 | 88 | 31.0 | 0.248 | 26 | 1 | 1.0 | 3 | 78 | 50 | 32 | 88 |
7 | 10 | 115 | 0 | 0 | 0 | 35.3 | 0.134 | 29 | 0 | 0.0 | 10 | 115 | 40 | 2 | 23 |
8 | 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | 1 | 0.0 | 2 | 197 | 70 | 45 | 509 |
9 | 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 | 0.0 | 8 | 125 | 96 | 2 | 23 |
BMI¶
In [92]:
sns.distplot(dataset['BMI']);
Variance measures the spread of a column's values around its mean¶
In [93]:
dataset['BMI'].describe()
Out[93]:
count    768.000000
mean      31.992578
std        7.884160
min        0.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64
In [94]:
plt.boxplot(dataset['BMI']);
In [95]:
IQ1R= dataset['BMI'].quantile(0.75)-dataset['BMI'].quantile(0.25)
IQ1R
Out[95]:
9.3
In [96]:
upper_BMI=dataset['BMI'].quantile(0.75)+(IQ1R*3)
upper_BMI
Out[96]:
64.5
In [97]:
dataset['BMI_mod']=dataset['BMI']
In [98]:
# floor implausibly low BMI values (including the zeros) at 15 and cap the
# high tail at the 3*IQR fence computed above
dataset.loc[dataset['BMI']<15, 'BMI_mod']=15
dataset.loc[dataset['BMI']>64.5, 'BMI_mod']=64.5
In [99]:
print("BMI:")
print(dataset['BMI'].describe())
print('------------------------------------------------------')
print("BMI_mod:")
print(dataset['BMI_mod'].describe())
BMI:
count    768.000000
mean      31.992578
std        7.884160
min        0.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64
------------------------------------------------------
BMI_mod:
count    768.000000
mean      32.204036
std        7.165761
min       15.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       64.500000
Name: BMI_mod, dtype: float64
In [100]:
dataset.head()
Out[100]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod | Insulin_mod | BMI_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 | 148 | 72 | 35 | 23 | 33.6 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 | 85 | 66 | 29 | 23 | 26.6 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 | 183 | 64 | 2 | 23 | 23.3 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 | 89 | 66 | 23 | 94 | 28.1 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 | 137 | 40 | 35 | 168 | 43.1 |
DiabetesPedigreeFunction¶
In [101]:
sns.distplot(dataset['DiabetesPedigreeFunction']);
In [102]:
plt.boxplot(dataset['DiabetesPedigreeFunction']);
In [103]:
dataset['DiabetesPedigreeFunction'].describe()
Out[103]:
count    768.000000
mean       0.471876
std        0.331329
min        0.078000
25%        0.243750
50%        0.372500
75%        0.626250
max        2.420000
Name: DiabetesPedigreeFunction, dtype: float64
In [104]:
dataset['DiabetesPedigreeFunction'].nunique()
Out[104]:
517
In [105]:
dataset['DiabetesPedigreeFunction'].unique()
Out[105]:
array([0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158, 0.232, 0.191, 0.537, 1.441, 0.398, 0.587, 0.484, 0.551, 0.254, 0.183, 0.529, 0.704, 0.388, 0.451, 0.263, 0.205, 0.257, 0.487, 0.245, 0.337, 0.546, 0.851, 0.267, 0.188, 0.512, 0.966, 0.42 , 0.665, 0.503, 1.39 , 0.271, 0.696, 0.235, 0.721, 0.294, 1.893, 0.564, 0.586, 0.344, 0.305, 0.491, 0.526, 0.342, 0.467, 0.718, 0.962, 1.781, 0.173, 0.304, 0.27 , 0.699, 0.258, 0.203, 0.855, 0.845, 0.334, 0.189, 0.867, 0.411, 0.583, 0.231, 0.396, 0.14 , 0.391, 0.37 , 0.307, 0.102, 0.767, 0.237, 0.227, 0.698, 0.178, 0.324, 0.153, 0.165, 0.443, 0.261, 0.277, 0.761, 0.255, 0.13 , 0.323, 0.356, 0.325, 1.222, 0.179, 0.262, 0.283, 0.93 , 0.801, 0.207, 0.287, 0.336, 0.247, 0.199, 0.543, 0.192, 0.588, 0.539, 0.22 , 0.654, 0.223, 0.759, 0.26 , 0.404, 0.186, 0.278, 0.496, 0.452, 0.403, 0.741, 0.361, 1.114, 0.457, 0.647, 0.088, 0.597, 0.532, 0.703, 0.159, 0.268, 0.286, 0.318, 0.272, 0.572, 0.096, 1.4 , 0.218, 0.085, 0.399, 0.432, 1.189, 0.687, 0.137, 0.637, 0.833, 0.229, 0.817, 0.204, 0.368, 0.743, 0.722, 0.256, 0.709, 0.471, 0.495, 0.18 , 0.542, 0.773, 0.678, 0.719, 0.382, 0.319, 0.19 , 0.956, 0.084, 0.725, 0.299, 0.244, 0.745, 0.615, 1.321, 0.64 , 0.142, 0.374, 0.383, 0.578, 0.136, 0.395, 0.187, 0.905, 0.15 , 0.874, 0.236, 0.787, 0.407, 0.605, 0.151, 0.289, 0.355, 0.29 , 0.375, 0.164, 0.431, 0.742, 0.514, 0.464, 1.224, 1.072, 0.805, 0.209, 0.666, 0.101, 0.198, 0.652, 2.329, 0.089, 0.645, 0.238, 0.394, 0.293, 0.479, 0.686, 0.831, 0.582, 0.446, 0.402, 1.318, 0.329, 1.213, 0.427, 0.282, 0.143, 0.38 , 0.284, 0.249, 0.926, 0.557, 0.092, 0.655, 1.353, 0.612, 0.2 , 0.226, 0.997, 0.933, 1.101, 0.078, 0.24 , 1.136, 0.128, 0.422, 0.251, 0.677, 0.296, 0.454, 0.744, 0.881, 0.28 , 0.259, 0.619, 0.808, 0.34 , 0.434, 0.757, 0.613, 0.692, 0.52 , 0.412, 0.84 , 0.839, 0.156, 0.215, 0.326, 1.391, 0.875, 0.313, 0.433, 0.626, 1.127, 0.315, 0.345, 0.129, 0.527, 0.197, 0.731, 0.148, 0.123, 0.127, 0.122, 1.476, 0.166, 0.932, 0.343, 0.893, 0.331, 0.472, 0.673, 0.389, 0.485, 0.349, 0.279, 0.346, 0.252, 0.243, 0.58 , 0.559, 0.302, 0.569, 0.378, 0.385, 0.499, 0.306, 0.234, 2.137, 1.731, 0.545, 0.225, 0.816, 0.528, 0.509, 1.021, 0.821, 0.947, 1.268, 0.221, 0.66 , 0.239, 0.949, 0.444, 0.463, 0.803, 1.6 , 0.944, 0.196, 0.241, 0.161, 0.135, 0.376, 1.191, 0.702, 0.674, 1.076, 0.534, 1.095, 0.554, 0.624, 0.219, 0.507, 0.561, 0.421, 0.516, 0.264, 0.328, 0.233, 0.108, 1.138, 0.147, 0.727, 0.435, 0.497, 0.23 , 0.955, 2.42 , 0.658, 0.33 , 0.51 , 0.285, 0.415, 0.381, 0.832, 0.498, 0.212, 0.364, 1.001, 0.46 , 0.733, 0.416, 0.705, 1.022, 0.269, 0.6 , 0.571, 0.607, 0.17 , 0.21 , 0.126, 0.711, 0.466, 0.162, 0.419, 0.63 , 0.365, 0.536, 1.159, 0.629, 0.292, 0.145, 1.144, 0.174, 0.547, 0.163, 0.738, 0.314, 0.968, 0.409, 0.297, 0.525, 0.154, 0.771, 0.107, 0.493, 0.717, 0.917, 0.501, 1.251, 0.735, 0.804, 0.661, 0.549, 0.825, 0.423, 1.034, 0.16 , 0.341, 0.68 , 0.591, 0.3 , 0.121, 0.502, 0.401, 0.601, 0.748, 0.338, 0.43 , 0.892, 0.813, 0.693, 0.575, 0.371, 0.206, 0.417, 1.154, 0.925, 0.175, 1.699, 0.682, 0.194, 0.4 , 0.1 , 1.258, 0.482, 0.138, 0.593, 0.878, 0.157, 1.282, 0.141, 0.246, 1.698, 1.461, 0.347, 0.362, 0.393, 0.144, 0.732, 0.115, 0.465, 0.649, 0.871, 0.149, 0.695, 0.303, 0.61 , 0.73 , 0.447, 0.455, 0.133, 0.155, 1.162, 1.292, 0.182, 1.394, 0.217, 0.631, 0.88 , 0.614, 0.332, 0.366, 0.181, 0.828, 0.335, 0.856, 0.886, 0.439, 0.253, 0.598, 0.904, 0.483, 0.565, 0.118, 0.177, 0.176, 0.295, 0.441, 0.352, 0.826, 0.97 , 0.595, 0.317, 0.265, 0.646, 0.426, 0.56 , 0.515, 0.453, 0.785, 0.734, 
1.174, 0.488, 0.358, 1.096, 0.408, 1.182, 0.222, 1.057, 0.766, 0.171])
In [106]:
IQ1R= dataset['DiabetesPedigreeFunction'].quantile(0.75)-dataset['DiabetesPedigreeFunction'].quantile(0.25)
IQ1R
Out[106]:
0.38249999999999995
In [107]:
upper_DPF=dataset['DiabetesPedigreeFunction'].quantile(0.75)+(IQ1R*3)
upper_DPF
Out[107]:
1.77375
In [108]:
dataset['DiabetesPedigreeFunction_mod']=dataset['DiabetesPedigreeFunction']
In [109]:
dataset.loc[dataset['DiabetesPedigreeFunction']>upper_DPF, 'DiabetesPedigreeFunction_mod']=upper_DPF
In [110]:
dataset.head()
Out[110]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod | Insulin_mod | BMI_mod | DiabetesPedigreeFunction_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.0 | 6 | 148 | 72 | 35 | 23 | 33.6 | 0.62700 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 0.0 | 1 | 85 | 66 | 29 | 23 | 26.6 | 0.35100 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 0.0 | 8 | 183 | 64 | 2 | 23 | 23.3 | 0.67200 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1.0 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.16700 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 0.0 | 0 | 137 | 40 | 35 | 168 | 43.1 | 1.77375 |
In [111]:
dataset.shape
Out[111]:
(768, 17)
In [112]:
dataset.describe()
Out[112]:
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod | Insulin_mod | BMI_mod | DiabetesPedigreeFunction_mod |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 | 0.477865 | 3.520833 | 121.024740 | 70.977865 | 21.127604 | 89.735677 | 32.204036 | 0.469005 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 | 0.499835 | 2.836176 | 31.516899 | 13.725500 | 15.198534 | 101.188603 | 7.165761 | 0.317492 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 | 0.000000 | 0.000000 | 20.000000 | 40.000000 | 2.000000 | 23.000000 | 15.000000 | 0.078000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 | 0.000000 | 1.000000 | 99.000000 | 62.000000 | 2.000000 | 23.000000 | 27.300000 | 0.243750 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 | 0.000000 | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 | 1.000000 | 5.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 | 1.000000 | 10.000000 | 199.000000 | 122.000000 | 99.000000 | 509.000000 | 64.500000 | 1.773750 |
In [113]:
dataset.columns
Out[113]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod', 'Pregnancies_mod', 'Glucose_mod', 'BloodPressure_mod', 'SkinThickness_mod', 'Insulin_mod', 'BMI_mod', 'DiabetesPedigreeFunction_mod'], dtype='object')
In [114]:
col=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
     'BMI', 'DiabetesPedigreeFunction', 'Age']
dataset.drop(columns=col, inplace=True)
In [115]:
dataset.head()
Out[115]:
|   | Outcome | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod | Insulin_mod | BMI_mod | DiabetesPedigreeFunction_mod |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 6 | 148 | 72 | 35 | 23 | 33.6 | 0.62700 |
1 | 0 | 0.0 | 1 | 85 | 66 | 29 | 23 | 26.6 | 0.35100 |
2 | 1 | 0.0 | 8 | 183 | 64 | 2 | 23 | 23.3 | 0.67200 |
3 | 0 | 1.0 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.16700 |
4 | 1 | 0.0 | 0 | 137 | 40 | 35 | 168 | 43.1 | 1.77375 |
In [116]:
col=dataset.columns
In [117]:
for i in col:
print(i,'--->', dataset[i].min())
print(i,'--->', dataset[i].max())
print("---------------------------------------------")
Outcome ---> 0
Outcome ---> 1
---------------------------------------------
Age_mod ---> 0.0
Age_mod ---> 1.0
---------------------------------------------
Pregnancies_mod ---> 0
Pregnancies_mod ---> 10
---------------------------------------------
Glucose_mod ---> 20
Glucose_mod ---> 199
---------------------------------------------
BloodPressure_mod ---> 40
BloodPressure_mod ---> 122
---------------------------------------------
SkinThickness_mod ---> 2
SkinThickness_mod ---> 99
---------------------------------------------
Insulin_mod ---> 23
Insulin_mod ---> 509
---------------------------------------------
BMI_mod ---> 15.0
BMI_mod ---> 64.5
---------------------------------------------
DiabetesPedigreeFunction_mod ---> 0.078
DiabetesPedigreeFunction_mod ---> 1.77375
---------------------------------------------
Model Building¶
In [118]:
X=dataset.drop('Outcome', axis=1)
Y=dataset['Outcome']
In [119]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
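The split above does not stratify on the label; with a roughly 65/35 class balance, stratifying keeps the same Outcome proportion in train and test. A sketch (the notebook itself uses the unstratified split, so different variable names are used here):¶
In [ ]:
# stratify=Y keeps the Outcome ratio identical in both partitions
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)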
Logistic Regression¶
In [121]:
lr=LogisticRegression()
In [122]:
%%time
lr.fit(X_train,Y_train)
CPU times: total: 31.2 ms
Wall time: 102 ms
C:\Users\kiran\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[122]:
LogisticRegression()
In [123]:
pred_lr= lr.predict(X_test)
In [125]:
accuracy_lr=accuracy_score(Y_test, pred_lr)
print("Accuracy: %.2f%%" % (accuracy_lr * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_lr))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_lr))
Accuracy: 79.87%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       107
           1       0.70      0.60      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154
-----------------------------------------------
[[95 12]
 [19 28]]
In [126]:
# AUC from the hard class predictions
auc_score1 = roc_auc_score(Y_test, pred_lr)
print(auc_score1*100)
74.17975740703918
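The AUC above is computed from hard 0/1 predictions; roc_auc_score is usually fed probability scores, which use the classifier's full ranking. A sketch (not part of the original run):¶
In [ ]:
# Column 1 of predict_proba holds the probability of the positive class
prob_lr = lr.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, prob_lr) * 100)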
Hyperparameter Tuning¶
In [127]:
# note: not every solver supports every penalty, so some of the combinations
# below will fail during the search (see the FitFailedWarning after the fit)
param_grid= [{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'C': np.logspace(-4,4,20),
              'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
              'max_iter': [100,500,1000,1500,2000,2500,5000]}]
In [128]:
from sklearn.model_selection import GridSearchCV
In [129]:
clf= GridSearchCV(lr, param_grid= param_grid, cv=3, verbose=True, n_jobs=-1)
In [131]:
%%time
best_clf= clf.fit(X_train, Y_train)
Fitting 3 folds for each of 2800 candidates, totalling 8400 fits
CPU times: total: 8.03 s
Wall time: 1min 25s
FitFailedWarning: 3780 fits failed out of a total of 8400; their scores are set to nan. (Tracebacks condensed.) Every failure is an invalid solver/penalty pairing from the grid above, e.g.:
    ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
    ValueError: Solver sag supports only 'l2' or 'none' penalties, got elasticnet penalty.
    ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
    ValueError: penalty='none' is not supported for the liblinear solver
    TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'   (saga + elasticnet without l1_ratio)
UserWarning: One or more of the test scores are non-finite: [ nan nan 0.64006855 ... nan 0.73283118 0.74261916]
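The failed fits come from pairing solvers with penalties they do not support. One way to avoid them is a list of per-solver grids; a sketch (not the search actually run above):¶
In [ ]:
# Each dict only combines a solver with the penalties it supports
param_grid_compat = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2'],
     'C': np.logspace(-4, 4, 20), 'max_iter': [1000]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': np.logspace(-4, 4, 20), 'max_iter': [1000]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5],
     'C': np.logspace(-4, 4, 20), 'max_iter': [1000]},
]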
In [132]:
best_clf.best_estimator_
Out[132]:
LogisticRegression(C=0.23357214690901212, solver='newton-cg')
In [133]:
best_clf.best_params_
Out[133]:
{'C': 0.23357214690901212, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
In [134]:
# note: these hyperparameters differ from best_clf.best_params_ above
# (they appear to come from a different search run)
lr2=LogisticRegression(C= 4.281332398719396, penalty='l1', max_iter=100, solver='liblinear')
In [135]:
lr2.fit(X_train,Y_train)
Out[135]:
LogisticRegression(C=4.281332398719396, penalty='l1', solver='liblinear')
In [136]:
pred_lr2= lr2.predict(X_test)
In [137]:
accuracy_lr2=accuracy_score(Y_test, pred_lr2)
print("Accuracy: %.2f%%" % (accuracy_lr2 * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_lr2))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_lr2))
Accuracy: 79.87%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       107
           1       0.70      0.60      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154
-----------------------------------------------
[[95 12]
 [19 28]]
In [138]:
auc_score = roc_auc_score(Y_test, pred_lr2)
print(auc_score)
0.7417975740703917
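The lbfgs convergence warning earlier suggested scaling the features; a pipeline that standardizes before fitting is a minimal sketch of that fix (not part of the original run, so no results are shown):¶
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit; scaling usually lets lbfgs converge
scaled_lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scaled_lr.fit(X_train, Y_train)
print(accuracy_score(Y_test, scaled_lr.predict(X_test)))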
SVM¶
In [139]:
from sklearn import svm
In [140]:
svc = svm.SVC()  # default RBF kernel
In [141]:
%%time
svc.fit(X_train,Y_train)
CPU times: total: 15.6 ms
Wall time: 22 ms
Out[141]:
SVC()
In [142]:
pred_svm= svc.predict(X_test)
In [143]:
accuracy_svm=accuracy_score(Y_test, pred_svm)
print("Accuracy: %.2f%%" % (accuracy_svm * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_svm))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_svm))
Accuracy: 79.22%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       107
           1       0.73      0.51      0.60        47

    accuracy                           0.79       154
   macro avg       0.77      0.71      0.73       154
weighted avg       0.78      0.79      0.78       154
-----------------------------------------------
[[98  9]
 [23 24]]
In [144]:
# note: C=0 is invalid (C must be strictly positive), and 'precomputed' expects a
# square kernel (Gram) matrix, which is what makes the search below crash
param_grid1=[{'C': [1.0,2.0,0,5.0,6.0,7.0,10.0],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
              'gamma':['scale', 'auto']
             }]
In [145]:
from sklearn.model_selection import RandomizedSearchCV
In [147]:
svm1 = svm.SVC(kernel='linear')  # kernel names are lowercase; 'Linear' would raise at fit time
In [148]:
clf_svm_random= RandomizedSearchCV(estimator=svm1,param_distributions=param_grid1,cv=3, verbose=100, n_jobs=-1)
In [150]:
%%time
best_clf_svm= clf_svm_random.fit(X_train, Y_train)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
ValueError: X should be a square kernel matrix
(Traceback condensed.) The search crashes as soon as it samples the 'precomputed' kernel candidate: that kernel expects X to be a precomputed square Gram matrix rather than the raw feature matrix, so _safe_split raises the error inside the parallel workers and RandomizedSearchCV re-raises it.
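A grid without the 'precomputed' kernel and the zero C would let the search run; a sketch (not executed here, so no results are shown):¶
In [ ]:
# Same search with only valid candidates: positive C values and data kernels
param_grid_svm = [{'C': [1.0, 2.0, 5.0, 6.0, 7.0, 10.0],
                   'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                   'gamma': ['scale', 'auto']}]
clf_svm_fixed = RandomizedSearchCV(estimator=svm.SVC(), param_distributions=param_grid_svm,
                                   cv=3, n_jobs=-1)
clf_svm_fixed.fit(X_train, Y_train)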
In [151]:
X_test.head()
Out[151]:
|   | Age_mod | Pregnancies_mod | Glucose_mod | BloodPressure_mod | SkinThickness_mod | Insulin_mod | BMI_mod | DiabetesPedigreeFunction_mod |
---|---|---|---|---|---|---|---|---|
661 | 1.0 | 1 | 199 | 76 | 43 | 23 | 42.9 | 1.394 |
122 | 1.0 | 2 | 107 | 74 | 30 | 100 | 33.6 | 0.404 |
113 | 1.0 | 4 | 76 | 62 | 2 | 23 | 34.0 | 0.391 |
14 | 0.0 | 5 | 166 | 72 | 19 | 175 | 25.8 | 0.587 |
529 | 0.0 | 0 | 111 | 65 | 2 | 23 | 24.6 | 0.660 |
Decision Trees¶
In [152]:
from sklearn import tree
In [153]:
dt = tree.DecisionTreeClassifier()
In [154]:
# note: "auto" for max_features has been deprecated for decision trees in recent scikit-learn versions
param_grid_dt= [{'criterion': ["gini", "entropy"],
                 'splitter': ["best", "random"],
                 'max_depth': [5,10,20,25,30,50],
                 'max_features': [2,4,6,8,10,"auto", "sqrt", "log2"]}]
In [155]:
clf_dt_random= RandomizedSearchCV(estimator=dt, param_distributions=param_grid_dt, cv=5, verbose=100, n_jobs=-1)
In [156]:
%%time
clf_dt_random.fit(X_train, Y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 203 ms
Wall time: 5.82 s
Out[156]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
                   param_distributions=[{'criterion': ['gini', 'entropy'],
                                         'max_depth': [5, 10, 20, 25, 30, 50],
                                         'max_features': [2, 4, 6, 8, 10, 'auto', 'sqrt', 'log2'],
                                         'splitter': ['best', 'random']}],
                   verbose=100)
In [157]:
clf_dt_random.best_estimator_
Out[157]:
DecisionTreeClassifier(max_depth=5, max_features=10, splitter='random')
In [158]:
clf_dt_random.best_params_
Out[158]:
{'splitter': 'random', 'max_features': 10, 'max_depth': 5, 'criterion': 'gini'}
Fitting the final Decision Tree¶
In [159]:
%%time
# note: these hyperparameters differ from clf_dt_random.best_params_ above
dt1 = tree.DecisionTreeClassifier(criterion='entropy',splitter='best',max_depth=10,max_features='log2')
dtfit = dt1.fit(X_train, Y_train)
CPU times: total: 0 ns
Wall time: 4 ms
In [160]:
pred_dt= dtfit.predict(X_test)
In [161]:
accuracy_dt=accuracy_score(Y_test, pred_dt)
print("Accuracy: %.2f%%" % (accuracy_dt * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_dt))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_dt))
Accuracy: 72.73%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       107
           1       0.56      0.53      0.54        47

    accuracy                           0.73       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.72      0.73      0.73       154
-----------------------------------------------
[[87 20]
 [22 25]]
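A side-by-side view of the accuracies collected above makes the comparison explicit; a convenience sketch using the variables already defined in this notebook:¶
In [ ]:
# Held-out accuracy of each model fitted in this notebook
results = {'Logistic Regression': accuracy_lr,
           'Tuned Logistic Regression': accuracy_lr2,
           'SVM (default)': accuracy_svm,
           'Decision Tree': accuracy_dt}
for name, acc in results.items():
    print(f"{name}: {acc * 100:.2f}%")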